{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "frQ5it0RmB0_"
      },
      "source": [
        "# Semantic Search with Cohere Embed Jobs and Pinecone serverless Solution"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Fdhi1O4lrqaV"
      },
      "outputs": [],
      "source": [
        "# TODO: upgrade to \"cohere>5\"\n",
        "! pip install \"cohere<5\" pinecone-client==3.0.0.dev3 -q"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "N_bYWQORGuMM",
        "outputId": "ee9f81f3-e896-46ff-e287-48502cff0b29"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/pinecone/data/index.py:1: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
            "  from tqdm.autonotebook import tqdm\n"
          ]
        }
      ],
      "source": [
        "import os\n",
        "import json\n",
        "import time\n",
        "import numpy as np\n",
        "import cohere\n",
        "from pinecone import Pinecone\n",
        "\n",
        "co = cohere.Client('COHERE_API_KEY')\n",
        "pc = Pinecone(api_key=('PINECONE_API_KEY'))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XulUR7tt69RM"
      },
      "source": [
        "## Step 1: Upload a dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "G677sKZc6NDv",
        "outputId": "e4bfd9ca-590a-4cf7-d9cf-3657a16e2a7f"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "uploading file, starting validation...\n",
            "sample-file-2gwgxq was uploaded\n",
            "...\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "cohere.Dataset {\n",
            "\tid: sample-file-2gwgxq\n",
            "\tname: sample_file\n",
            "\tdataset_type: embed-input\n",
            "\tvalidation_status: validated\n",
            "\tcreated_at: 2024-01-13 02:47:32.563080\n",
            "\tupdated_at: 2024-01-13 02:47:32.563081\n",
            "\tdownload_urls: ['https://storage.googleapis.com/cohere-user/dataset-api-temp/d489c39a-e152-49da-9ddc-9801bd74d823/96d12a16-2dd4-46f7-9630-1fa9bb0b26ca/sample-file-2gwgxq/000_embed_jobs_sample_data.avro?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=dataset%40cohere-production.iam.gserviceaccount.com%2F20240113%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240113T024743Z&X-Goog-Expires=14399&X-Goog-Signature=1cc013824d54e4974600e19ec5c5dd6624c3de0c512f1c7d204fa95056b09e19d90c0fe034c59f0eef8bdfd4e1ed03b44c601d290e9b9c9e0643afd7a44fe97c42750304a8f199c9d87abb50fc74d777ab2d36efecca64ad97264f9e0628f2e199b9eb2d505241480a7436191cacaa78efb328567c9303469b653962caf07e6b11fac06ed06bd4597377e87fac58214bab1cd5cde63e19508d903e65ad654a177e27a64105b79c56d0cc156a35b61d45a7dda3b9819ef78dc9861c818b808527a1e16210dc83130be630f1b54c75d280ea20d070566056a6b2c7b1a016409482defc2a1942a801e46f5349adfadb244711da80d103ed831d6927366adc0c1659&X-Goog-SignedHeaders=host']\n",
            "\tvalidation_error: None\n",
            "\tvalidation_warnings: []\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "# Upload a dataset for embed jobs\n",
        "dataset_file_path = \"data/embed_jobs_sample_data.jsonl\" # Full path - https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/embed_jobs_sample_data.jsonl\n",
        "\n",
        "ds=co.create_dataset(\n",
        "\tname='sample_file',\n",
        "\t# insert your file path here - you can upload it on the right - we accept .csv and jsonl files\n",
        "\tdata=open(dataset_file_path, 'rb'),\n",
        "\tdataset_type=\"embed-input\"\n",
        "\t)\n",
        "\n",
        "print(ds.await_validation())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5VAoaVo47Bmi"
      },
      "source": [
        "## Step 2: Create embeddings via Cohere's Embed Jobs endpoint"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "77Vw5BdWGzFl",
        "outputId": "a5e0c9f5-8172-4b80-a8ea-aa4e763fdeda"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "...\n",
            "...\n"
          ]
        }
      ],
      "source": [
        "# Dataset has been uploaded, create an embed job and specify the input type as \"search document\" since this will live in your Pinecone DB\n",
        "job = co.create_embed_job(dataset_id=ds.id,\n",
        "                          input_type='search_document',\n",
        "                          model='embed-english-v3.0',\n",
        "                          embeddings_types=['float'])\n",
        "\n",
        "job.wait() # poll the server until the job is completed "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d__jANfVvNld",
        "outputId": "c4dcd936-c338-47b3-994d-71cdceaa6796"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "cohere.EmbedJob {\n",
            "\tjob_id: 6d691fbe-e026-436a-826a-16e70b293e51\n",
            "\tstatus: complete\n",
            "\tcreated_at: 2024-01-13T02:47:46.385016Z\n",
            "\tinput_dataset_id: sample-file-2gwgxq\n",
            "\toutput_urls: None\n",
            "\tmodel: embed-english-v3.0\n",
            "\ttruncate: RIGHT\n",
            "\tpercent_complete: 100\n",
            "\toutput: cohere.Dataset {\n",
            "\tid: embeded-sample-file-mdse2h\n",
            "\tname: embeded-sample-file\n",
            "\tdataset_type: embed-result\n",
            "\tvalidation_status: validated\n",
            "\tcreated_at: 2024-01-13 02:47:47.850097\n",
            "\tupdated_at: 2024-01-13 02:47:47.850097\n",
            "\tdownload_urls: ['https://storage.googleapis.com/cohere-user/dataset-api-temp/d489c39a-e152-49da-9ddc-9801bd74d823/96d12a16-2dd4-46f7-9630-1fa9bb0b26ca/embeded-sample-file-mdse2h/001_embeded-sample-file.avro?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=dataset%40cohere-production.iam.gserviceaccount.com%2F20240113%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240113T024807Z&X-Goog-Expires=14399&X-Goog-Signature=78b0d82e19388aee2926a4ef403d5f286487d0e7fc9a09c75bfb6700b502fa4f55b024977c0aeae26cd501a41e506bf00d3e850d7d9691298963374f04fa129e8dae5a327389c80921bf611a25386f5e4501b11c9d88bc1aee6f6877157a4675c5f8fe06bb25e3ca2b12da46e5b1da3067ca71bed901cd14db26e09987e355c039f96d6514a0931aa5f8753ddc155ca1782c63e3cb000b095d3b29904982ff75686c716329e92b6946485c567dabc3e344c9a0f9a59416415738b67ead0cca3cdb06c1db64c925ea38a2d92ab4079577e5775367260c09916aab5af67326bca4fa1295ee76457f933a6ca26a5d4ac9c59f0f73286627b2bae3e7fa7375c97465&X-Goog-SignedHeaders=host']\n",
            "\tvalidation_error: None\n",
            "\tvalidation_warnings: []\n",
            "}\n",
            "}\n"
          ]
        }
      ],
      "source": [
        "print(job)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UlEVKOCG7ZsN"
      },
      "source": [
        "## Step 3: Prepare embeddings for upsert"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vDvntjsnG_DX"
      },
      "outputs": [],
      "source": [
        "# Load the output file into an array\n",
        "output_dataset=co.get_dataset(job.output.id)\n",
        "data_array = []\n",
        "for record in output_dataset:\n",
        "  data_array.append(record)\n",
        "\n",
        "# Take the output and format it in the shape for upserting into Pinecone's DB\n",
        "ids = [str(i) for i in range(len(data_array))]\n",
        "meta = [{'text':str(data_array[i]['text'])} for i in range(len(data_array))]\n",
        "embeds=[np.float32(data_array[i]['embeddings']['float']) for i in range(len(data_array))]\n",
        "\n",
        "to_upsert = list(zip(ids, embeds, meta))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pQwXK4xt7lls"
      },
      "source": [
        "## Step 4: Initialize Pinecone vector database"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GrLN8Y_THEBH"
      },
      "outputs": [],
      "source": [
        "# Initialize your Pinecone Vector DB\n",
        "from pinecone import ServerlessSpec\n",
        "\n",
        "index_name = \"embed-jobs-serverless-test-example\"\n",
        "\n",
        "# A new property 'spec' is used to tell Pinecone how we should deploy your index.\n",
        "pc.create_index(\n",
        "name=index_name,\n",
        "dimension=1024,\n",
        "metric=\"cosine\",\n",
        "spec=ServerlessSpec(cloud='aws', region='us-west-2')\n",
        ")\n",
        "\n",
        "# Target your new serverless index.\n",
        "idx = pc.Index(index_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "W2QMVhoM7o-5"
      },
      "source": [
        "## Step 5: Upsert embeddings into the index"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JaVrXHZ3IjiN",
        "outputId": "716ad587-2ee1-475a-f1e5-9a0b23d2df87"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "{'dimension': 1024,\n",
            " 'index_fullness': 0.0,\n",
            " 'namespaces': {'': {'vector_count': 3664}},\n",
            " 'total_vector_count': 3664}\n"
          ]
        }
      ],
      "source": [
        "# Upsert your data into the index\n",
        "batch_size = 128\n",
        "\n",
        "for i in range(0, len(data_array), batch_size):\n",
        "    i_end = min(i+batch_size, len(data_array))\n",
        "    idx.upsert(vectors=to_upsert[i:i_end])\n",
        "\n",
        "# let's view the index statistics\n",
        "print(idx.describe_index_stats())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AtT1Yo_i8CZc"
      },
      "source": [
        "## Step 6: Query the index"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "t-A82Z1EIrKR",
        "outputId": "ed708eef-0c0b-4688-f0d7-f5fc39219848"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "(1, 1024)\n"
          ]
        }
      ],
      "source": [
        "# Let's query the database\n",
        "query = \"What did Microsoft announce in Las Vegas?\"\n",
        "\n",
        "# create the query embedding\n",
        "xq = co.embed(\n",
        "    texts=[query],\n",
        "    model='embed-english-v3.0',\n",
        "    input_type='search_query',\n",
        "    truncate='END'\n",
        ").embeddings\n",
        "\n",
        "print(np.array(xq).shape)\n",
        "\n",
        "# query, returning the top 20 most similar results\n",
        "res = idx.query(xq, top_k=20, include_metadata=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pXhvxHwj5nX2",
        "outputId": "422948e1-a554-4bd2-9d28-076df6f5c3ee"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "0.48: On October 22, 2012, Microsoft announced the release of new features including co-authoring, performance improvements and touch support.\n",
            "0.45: On May 2, 2019, at F8, the company announced its new vision with the tagline \"the future is private\". A redesign of the website and mobile app was introduced, dubbed as \"FB5\". The event also featured plans for improving groups, a dating platform, end-to-end encryption on its platforms, and allowing users on Messenger to communicate directly with WhatsApp and Instagram users.\n",
            "0.42: On July 13, 2009, Microsoft announced at its Worldwide Partners Conference 2009 in New Orleans that Microsoft Office 2010 reached its \"Technical Preview\" development milestone and features of Office Web Apps were demonstrated to the public for the first time. Additionally, Microsoft announced that Office Web Apps would be made available to consumers online and free of charge, while Microsoft Software Assurance customers will have the option of running them on premises. Office 2010 beta testers were not given access to Office Web Apps at this date, and it was announced that it would be available for testers during August 2009. However, in August 2009, a Microsoft spokesperson stated that there had been a delay in the release of Office Web Apps Technical Preview and it would not be available by the end of August.\n",
            "0.42: On January 17, 2017, Facebook COO Sheryl Sandberg planned to open Station F, a startup incubator campus in Paris, France. On a six-month cycle, Facebook committed to work with ten to 15 data-driven startups there. On April 18, Facebook announced the beta launch of at its annual F8 developer conference. Facebook Spaces is a virtual reality version of Facebook for Oculus VR goggles. In a virtual and shared space, users can access a curated selection of 360-degree photos and videos using their avatar, with the support of the controller. Users can access their own photos and videos, along with media shared on their newsfeed. In September, Facebook announced it would spend up to US$1 billion on original shows for its Facebook Watch platform. On October 16, it acquired the anonymous compliment app tbh, announcing its intention to leave the app independent.\n",
            "0.41: On September 26, 2017, Microsoft announced that the next version of the suite for Windows desktop, Office 2019, was in development. On April 27, 2018, Microsoft released Office 2019 Commercial Preview for Windows 10. It was released to general availability for Windows 10 and for macOS on September 24, 2018.\n",
            "0.41: Microsoft Office, or simply Office, is the former name of a family of client software, server software, and services developed by Microsoft. It was first announced by Bill Gates on August 1, 1988, at COMDEX in Las Vegas. Initially a marketing term for an office suite (bundled set of productivity applications), the first version of Office contained Microsoft Word, Microsoft Excel, and Microsoft PowerPoint. Over the years, Office applications have grown substantially closer with shared features such as a common spell checker, Object Linking and Embedding data integration and Visual Basic for Applications scripting language. Microsoft also positions Office as a development platform for line-of-business software under the Office Business Applications brand.\n",
            "0.40: On August 12, 2009, it was announced that Office Mobile would also be released for the Symbian platform as a joint agreement between Microsoft and Nokia. It was the first time Microsoft would develop Office mobile applications for another smartphone platform. The first application to appear on Nokia Eseries smartphones was Microsoft Office Communicator. In February 2012, Microsoft released OneNote, Lync 2010, Document Connection and PowerPoint Broadcast for Symbian. In April, Word Mobile, PowerPoint Mobile and Excel Mobile joined the Office Suite.\n",
            "0.40: In 2010, Microsoft introduced a software as a service platform known as Office 365, to provide cloud-hosted versions of Office's server software, including Exchange e-mail and SharePoint, on a subscription basis (competing in particular with Google Apps). Following the release of Office 2013, Microsoft began to offer Office 365 plans for the consumer market, with access to Microsoft Office software on multiple devices with free feature updates over the life of the subscription, as well as other services such as OneDrive storage.\n",
            "0.40: On April 12, 2016, Zuckerberg outlined his 10-year vision, which rested on three main pillars: artificial intelligence, increased global connectivity, and virtual and augmented reality. In July, a suit was filed against the company alleging that it permitted Hamas to use it to perform assaults that cost the lives of four people. Facebook released its blueprints of Surround 360 camera on GitHub under an open-source license. In September, it won an Emmy for its animated short \"Henry\". In October, Facebook announced a fee-based communications tool called Workplace that aims to \"connect everyone\" at work. Users can create profiles, see updates from co-workers on their news feed, stream live videos and participate in secure group chats.\n",
            "0.40: On January 22, 2015, the Microsoft Office blog announced that the next version of the suite for Windows desktop, Office 2016, was in development. On May 4, 2015, a public preview of Microsoft Office 2016 was released. Office 2016 was released for Mac OS X on July 9, 2015 and for Windows on September 22, 2015.\n",
            "0.39: On November 6, 2013, Microsoft announced further new features including \"real-time\" co-authoring and an Auto-Save feature in Word (replacing the save button).\n",
            "0.39: In February 2014, Office Web Apps were re-branded Office Online and incorporated into other Microsoft web services, including Calendar, OneDrive, Outlook.com, and People. Microsoft had previously attempted to unify its online services suite (including Microsoft Passport, Hotmail, MSN Messenger, and later SkyDrive) under a brand known as Windows Live, first launched in 2005. However, with the impending launch of Windows 8 and its increased use of cloud services, Microsoft dropped the Windows Live brand to emphasize that these services would now be built directly into Windows and not merely be a \"bolted on\" add-on. Critics had criticized the Windows Live brand for having no clear vision, as it was being applied to an increasingly broad array of unrelated services. At the same time, Windows Live Hotmail was re-launched as Outlook.com (sharing its name with the Microsoft Outlook personal information manager).\n",
            "0.39: On February 18, 2021, Microsoft announced that the next version of the suite for Windows desktop, Office 2021, was in development. This new version will be supported for five years and was released on October 5, 2021.\n",
            "0.38: Since Office 2013, Microsoft has promoted Office 365 as the primary means of obtaining Microsoft Office: it allows the use of the software and other services on a subscription business model, and users receive feature updates to the software for the lifetime of the subscription, including new features and cloud computing integration that are not necessarily included in the \"on-premises\" releases of Office sold under conventional license terms. In 2017, revenue from Office 365 overtook conventional license sales. Microsoft also rebranded most of their standard Office 365 editions as \"Microsoft 365\" to reflect their inclusion of features and services beyond the core Microsoft Office suite.\n",
            "0.38: Microsoft has since promoted Office 365 as the primary means of purchasing Microsoft Office. Although there are still \"on-premises\" releases roughly every three years, Microsoft marketing emphasizes that they do not receive new features or access to new cloud-based services as they are released unlike Office 365, as well as other benefits for consumer and business markets. Office 365 revenue overtook traditional license sales for Office in 2017.\n",
            "0.38: A technical preview of Microsoft Office 2013 (Build 15.0.3612.1010) was released on January 30, 2012, and a Customer Preview version was made available to consumers on July 16, 2012. It sports a revamped application interface; the interface is based on Metro, the interface of Windows Phone and Windows 8. Microsoft Outlook has received the most pronounced changes so far; for example, the Metro interface provides a new visualization for scheduled tasks. PowerPoint includes more templates and transition effects, and OneNote includes a new splash screen.\n",
            "0.38: On January 21, 2015, during the \"Windows 10: The Next Chapter\" press event, Microsoft unveiled Office for Windows 10, Windows Runtime ports of the Android and iOS versions of the Office Mobile suite. Optimized for smartphones and tablets, they are universal apps that can run on both Windows and Windows for phones, and share similar underlying code. A simplified version of Outlook was also added to the suite. They will be bundled with Windows 10 mobile devices, and available from the Windows Store for the PC version of Windows 10. Although the preview versions were free for most editing, the release versions will require an Office 365 subscription on larger tablets (screen size larger than 10.1 inches) and desktops for editing, as with large Android tablets. Smaller tablets and phones will have most editing features for free.\n",
            "0.38: In May 2018 at F8, the company announced it would offer its own dating service. Shares in competitor Match Group fell by 22%. Facebook Dating includes privacy features and friends are unable to view their friends' dating profile. In July, Facebook was charged £500,000 by UK watchdogs for failing to respond to data erasure requests. On July 18, Facebook established a subsidiary named Lianshu Science & Technology in Hangzhou City, China, with $30 million ($ in dollars) of capital. All its shares are held by Facebook Hong. Approval of the registration of the subsidiary was then withdrawn, due to a disagreement between officials in Zhejiang province and the Cyberspace Administration of China. On July 26, Facebook became the first company to lose over $100 billion ($ in dollars) worth of market capitalization in one day, dropping from nearly $630 billion to $510 billion after disappointing sales reports. On July 31, Facebook said that the company had deleted 17 accounts related to the 2018 U.S. midterm elections. On September 19, Facebook announced that, for news distribution outside the United States, it would work with U.S. funded democracy promotion organizations, International Republican Institute and the National Democratic Institute, which are loosely affiliated with the Republican and Democratic parties. Through the Digital Forensic Research Lab Facebook partners with the Atlantic Council, a NATO-affiliated think tank. In November, Facebook launched smart displays branded Portal and Portal Plus (Portal+). They support Amazon's Alexa (intelligent personal assistant service). The devices include video chat function with Facebook Messenger.\n",
            "0.37: The first Preview version of Microsoft Office 2016 for Mac was released on March 5, 2015. On July 9, 2015, Microsoft released the final version of Microsoft Office 2016 for Mac which includes Word, Excel, PowerPoint, Outlook and OneNote. It was immediately made available for Office 365 subscribers with either a Home, Personal, Business, Business Premium, E3 or ProPlus subscription. A non–Office 365 edition of Office 2016 was made available as a one-time purchase option on September 22, 2015.\n",
            "0.37: In October 2022, Microsoft announced that it will phase out the Microsoft Office brand in favor of \"Microsoft 365\" by January 2023. The name will continue to be used for legacy product offerings.\n"
          ]
        }
      ],
      "source": [
        "# Look at the initial retrieval results\n",
        "for match in res['matches']:\n",
        "    print(f\"{match['score']:.2f}: {match['metadata']['text']}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DyevqtBB8KyU"
      },
      "source": [
        "## Step 7: Rerank the retrieved results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Rqo6YdzJI0ny",
        "outputId": "aac1de9a-9cae-4c4c-f4a3-fb87583f62f2"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "0.99: Microsoft Office, or simply Office, is the former name of a family of client software, server software, and services developed by Microsoft. It was first announced by Bill Gates on August 1, 1988, at COMDEX in Las Vegas. Initially a marketing term for an office suite (bundled set of productivity applications), the first version of Office contained Microsoft Word, Microsoft Excel, and Microsoft PowerPoint. Over the years, Office applications have grown substantially closer with shared features such as a common spell checker, Object Linking and Embedding data integration and Visual Basic for Applications scripting language. Microsoft also positions Office as a development platform for line-of-business software under the Office Business Applications brand.\n",
            "0.93: On January 21, 2015, during the \"Windows 10: The Next Chapter\" press event, Microsoft unveiled Office for Windows 10, Windows Runtime ports of the Android and iOS versions of the Office Mobile suite. Optimized for smartphones and tablets, they are universal apps that can run on both Windows and Windows for phones, and share similar underlying code. A simplified version of Outlook was also added to the suite. They will be bundled with Windows 10 mobile devices, and available from the Windows Store for the PC version of Windows 10. Although the preview versions were free for most editing, the release versions will require an Office 365 subscription on larger tablets (screen size larger than 10.1 inches) and desktops for editing, as with large Android tablets. Smaller tablets and phones will have most editing features for free.\n",
            "0.87: In October 2022, Microsoft announced that it will phase out the Microsoft Office brand in favor of \"Microsoft 365\" by January 2023. The name will continue to be used for legacy product offerings.\n"
          ]
        }
      ],
      "source": [
        "# Add Cohere Reranking Step\n",
        "docs =[match['metadata']['text'] for match in res['matches']]\n",
        "\n",
        "rerank_response = co.rerank(\n",
        "  model = 'rerank-english-v2.0',\n",
        "  query = query,\n",
        "  documents = docs,\n",
        "  top_n = 3,\n",
        ")\n",
        "for response in rerank_response:\n",
        "  print(f\"{response.relevance_score:.2f}: {response.document['text']}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Another example - query and rerank"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2laQ-A5x9HY8",
        "outputId": "c3d831fd-5df7-412e-8637-9c51c51c0846"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "(1, 1024)\n",
            "0.66: YouTube began as a venture capital–funded technology startup. Between November 2005 and April 2006, the company raised money from various investors, with Sequoia Capital, $11.5 million, and Artis Capital Management, $8 million, being the largest two. YouTube's early headquarters were situated above a pizzeria and a Japanese restaurant in San Mateo, California. In February 2005, the company activated codice_1. The first video was uploaded April 23, 2005. Titled \"Me at the zoo\", it shows co-founder Jawed Karim at the San Diego Zoo and can still be viewed on the site. In May, the company launched a public beta and by November, a Nike ad featuring Ronaldinho became the first video to reach one million total views. The site launched officially on December 15, 2005, by which time the site was receiving 8 million views a day. Clips at the time were limited to 100 megabytes, as little as 30 seconds of footage.\n",
            "0.58: Karim said the inspiration for YouTube first came from the Super Bowl XXXVIII halftime show controversy when Janet Jackson's breast was briefly exposed by Justin Timberlake during the halftime show. Karim could not easily find video clips of the incident and the 2004 Indian Ocean Tsunami online, which led to the idea of a video-sharing site. Hurley and Chen said that the original idea for YouTube was a video version of an online dating service, and had been influenced by the website Hot or Not. They created posts on Craigslist asking attractive women to upload videos of themselves to YouTube in exchange for a $100 reward. Difficulty in finding enough dating videos led to a change of plans, with the site's founders deciding to accept uploads of any video.\n",
            "0.55: YouTube was not the first video-sharing site on the Internet; Vimeo was launched in November 2004, though that site remained a side project of its developers from CollegeHumor at the time and did not grow much, either. The week of YouTube's launch, NBC-Universal's \"Saturday Night Live\" ran a skit \"Lazy Sunday\" by The Lonely Island. Besides helping to bolster ratings and long-term viewership for \"Saturday Night Live\", \"Lazy Sunday\"'s status as an early viral video helped establish YouTube as an important website. Unofficial uploads of the skit to YouTube drew in more than five million collective views by February 2006 before they were removed when NBCUniversal requested it two months later based on copyright concerns. Despite eventually being taken down, these duplicate uploads of the skit helped popularize YouTube's reach and led to the upload of more third-party content. The site grew rapidly; in July 2006, the company announced that more than 65,000 new videos were being uploaded every day and that the site was receiving 100 million video views per day.\n",
            "0.55: According to a story that has often been repeated in the media, Hurley and Chen developed the idea for YouTube during the early months of 2005, after they had experienced difficulty sharing videos that had been shot at a dinner party at Chen's apartment in San Francisco. Karim did not attend the party and denied that it had occurred, but Chen remarked that the idea that YouTube was founded after a dinner party \"was probably very strengthened by marketing ideas around creating a story that was very digestible\".\n",
            "0.53: In December 2009, YouTube partnered with Vevo. In April 2010, Lady Gaga's \"Bad Romance\" became the most viewed video, becoming the first video to reach 200 million views on May 9, 2010.\n",
            "0.53: YouTube is a global online video sharing and social media platform headquartered in San Bruno, California. It was launched on February 14, 2005, by Steve Chen, Chad Hurley, and Jawed Karim. It is owned by Google, and is the second most visited website, after Google Search. YouTube has more than 2.5 billion monthly users who collectively watch more than one billion hours of videos each day. , videos were being uploaded at a rate of more than 500 hours of content per minute.\n",
            "0.53: YouTube has faced numerous challenges and criticisms in its attempts to deal with copyright, including the site's first viral video, Lazy Sunday, which had to be taken down, due to copyright concerns. At the time of uploading a video, YouTube users are shown a message asking them not to violate copyright laws. Despite this advice, many unauthorized clips of copyrighted material remain on YouTube. YouTube does not view videos before they are posted online, and it is left to copyright holders to issue a DMCA takedown notice pursuant to the terms of the Online Copyright Infringement Liability Limitation Act. Any successful complaint about copyright infringement results in a YouTube copyright strike. Three successful complaints for copyright infringement against a user account will result in the account and all of its uploaded videos being deleted. From 2007 to 2009 organizations including Viacom, Mediaset, and the English Premier League have filed lawsuits against YouTube, claiming that it has done too little to prevent the uploading of copyrighted material.\n",
            "0.51: Some YouTube videos have themselves had a direct effect on world events, such as \"Innocence of Muslims\" (2012) which spurred protests and related anti-American violence internationally. TED curator Chris Anderson described a phenomenon by which geographically distributed individuals in a certain field share their independently developed skills in YouTube videos, thus challenging others to improve their own skills, and spurring invention and evolution in that field. Journalist Virginia Heffernan stated in \"The New York Times\" that such videos have \"surprising implications\" for the dissemination of culture and even the future of classical music.\n",
            "0.50: Observing that face-to-face communication of the type that online videos convey has been \"fine-tuned by millions of years of evolution,\" TED curator Chris Anderson referred to several YouTube contributors and asserted that \"what Gutenberg did for writing, online video can now do for face-to-face communication.\" Anderson asserted that it is not far-fetched to say that online video will dramatically accelerate scientific advance, and that video contributors may be about to launch \"the biggest learning cycle in human history.\" In education, for example, the Khan Academy grew from YouTube video tutoring sessions for founder Salman Khan's cousin into what \"Forbes\" Michael Noer called \"the largest school in the world,\" with technology poised to disrupt how people learn. YouTube was awarded a 2008 George Foster Peabody Award, the website being described as a Speakers' Corner that \"both embodies and promotes democracy.\" \"The Washington Post\" reported that a disproportionate share of YouTube's most subscribed channels feature minorities, contrasting with mainstream television in which the stars are largely white. A Pew Research Center study reported the development of \"visual journalism,\" in which citizen eyewitnesses and established news organizations share in content creation. The study also concluded that YouTube was becoming an important platform by which people acquire news.\n",
            "0.50: YouTube was founded by Steve Chen, Chad Hurley, and Jawed Karim. The trio were early employees of PayPal, which left them enriched after the company was bought by eBay. Hurley had studied design at the Indiana University of Pennsylvania, and Chen and Karim studied computer science together at the University of Illinois Urbana-Champaign.\n",
            "0.49: In 2013, YouTube teamed up with satirical newspaper company \"The Onion\" to claim in an uploaded video that the video-sharing website was launched as a contest which had finally come to an end, and would shut down for ten years before being re-launched in 2023, featuring only the winning video. The video starred several YouTube celebrities, including Antoine Dodson. A video of two presenters announcing the nominated videos streamed live for 12 hours.\n",
            "0.48: Since its purchase by Google, YouTube has expanded beyond the core website into mobile apps, network television, and the ability to link with other platforms. Video categories on YouTube include music videos, video clips, news, short films, feature films, documentaries, audio recordings, movie trailers, teasers, live streams, vlogs, and more. Most content is generated by individuals, including collaborations between YouTubers and corporate sponsors. Established media corporations such as Disney, Paramount, and Warner Bros. Discovery have also created and expanded their corporate YouTube channels to advertise to a larger audience.\n",
            "0.47: YouTube has enabled people to more directly engage with government, such as in the CNN/YouTube presidential debates (2007) in which ordinary people submitted questions to U.S. presidential candidates via YouTube video, with a techPresident co-founder saying that Internet video was changing the political landscape. Describing the Arab Spring (2010–2012), sociologist Philip N. Howard quoted an activist's succinct description that organizing the political unrest involved using \"Facebook to schedule the protests, Twitter to coordinate, and YouTube to tell the world.\" In 2012, more than a third of the U.S. Senate introduced a resolution condemning Joseph Kony 16 days after the \"Kony 2012\" video was posted to YouTube, with resolution co-sponsor Senator Lindsey Graham remarking that the video \"will do more to lead to (Kony's) demise than all other action combined.\"\n",
            "0.47: YouTube carried out early experiments with live streaming, including a concert by U2 in 2009, and a question-and-answer session with US President Barack Obama in February 2010. These tests had relied on technology from 3rd-party partners, but in September 2010, YouTube began testing its own live streaming infrastructure. In April 2011, YouTube announced the rollout of \"YouTube Live\". The creation of live streams was initially limited to select partners. It was used for real-time broadcasting of events such as the 2012 Olympics in London. In October 2012, more than 8 million people watched Felix Baumgartner's jump from the edge of space as a live stream on YouTube.\n",
            "0.46: In June 2007, YouTube began trials of a system for automatic detection of uploaded videos that infringe copyright. Google CEO Eric Schmidt regarded this system as necessary for resolving lawsuits such as the one from Viacom, which alleged that YouTube profited from content that it did not have the right to distribute. The system, which was initially called \"Video Identification\" and later became known as Content ID, creates an ID File for copyrighted audio and video material, and stores it in a database. When a video is uploaded, it is checked against the database, and flags the video as a copyright violation if a match is found. When this occurs, the content owner has the choice of blocking the video to make it unviewable, tracking the viewing statistics of the video, or adding advertisements to the video.\n",
            "0.46: In January 2009, YouTube launched \"YouTube for TV\", a version of the website tailored for set-top boxes and other TV-based media devices with web browsers, initially allowing its videos to be viewed on the PlayStation 3 and Wii video game consoles.\n",
            "0.46: In September 2012, YouTube launched its first app for the iPhone, following the decision to drop YouTube as one of the preloaded apps in the iPhone 5 and iOS 6 operating system. According to GlobalWebIndex, YouTube was used by 35% of smartphone users between April and June 2013, making it the third-most used app.\n",
            "0.46: Conversely, YouTube has also allowed government to more easily engage with citizens, the White House's official YouTube channel being the seventh top news organization producer on YouTube in 2012 and in 2013 a healthcare exchange commissioned Obama impersonator Iman Crosson's YouTube music video spoof to encourage young Americans to enroll in the Affordable Care Act (Obamacare)-compliant health insurance. In February 2014, U.S. President Obama held a meeting at the White House with leading YouTube content creators to not only promote awareness of Obamacare but more generally to develop ways for government to better connect with the \"YouTube Generation.\" Whereas YouTube's inherent ability to allow presidents to directly connect with average citizens was noted, the YouTube content creators' new media savvy was perceived necessary to better cope with the website's distracting content and fickle audience.\n",
            "0.46: Later that year, YouTube came under criticism for showing inappropriate videos targeted at children and often featuring popular characters in violent, sexual or otherwise disturbing situations, many of which appeared on YouTube Kids and attracted millions of views. The term \"Elsagate\" was coined on the Internet and then used by various news outlets to refer to this controversy. On November 11, 2017, YouTube announced it was strengthening site security to protect children from unsuitable content. Later that month, the company started to mass delete videos and channels that made improper use of family-friendly characters. As part of a broader concern regarding child safety on YouTube, the wave of deletions also targeted channels that showed children taking part in inappropriate or dangerous activities under the guidance of adults. Most notably, the company removed \"Toy Freaks\", a channel with over 8.5 million subscribers, that featured a father and his two daughters in odd and upsetting situations. According to analytics specialist SocialBlade, it earned up to £8.7 million annually prior to its deletion.\n",
            "0.45: In September 2020, YouTube announced that it would be launching a beta version of a new platform of 15-second videos, similar to TikTok, called YouTube Shorts. The platform was first tested in India but as of March 2021 has expanded to other countries including the United States with videos now able to be up to 1 minute long. The platform is not a standalone app, but is integrated into the main YouTube app. Like TikTok, it gives users access to built-in creative tools, including the possibility of adding licensed music to their videos. The platform had its global beta launch in July 2021.\n"
          ]
        }
      ],
      "source": [
        "# Let's query the database\n",
        "query = \"What was the first youtube video about?\"\n",
        "\n",
        "# create the query embedding\n",
        "xq = co.embed(\n",
        "    texts=[query],\n",
        "    model='embed-english-v3.0',\n",
        "    input_type='search_query',\n",
        "    truncate='END'\n",
        ").embeddings\n",
        "\n",
        "print(np.array(xq).shape)\n",
        "\n",
        "# query, returning the top 20 most similar results\n",
        "res = idx.query(xq, top_k=20, include_metadata=True)\n",
        "\n",
        "# Look at the initial retrieval results\n",
        "for match in res['matches']:\n",
        "    print(f\"{match['score']:.2f}: {match['metadata']['text']}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WaN6U6u19Md6",
        "outputId": "9984d8ce-1383-4530-ec84-6e3f3feea05e"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "0.95: YouTube began as a venture capital–funded technology startup. Between November 2005 and April 2006, the company raised money from various investors, with Sequoia Capital, $11.5 million, and Artis Capital Management, $8 million, being the largest two. YouTube's early headquarters were situated above a pizzeria and a Japanese restaurant in San Mateo, California. In February 2005, the company activated codice_1. The first video was uploaded April 23, 2005. Titled \"Me at the zoo\", it shows co-founder Jawed Karim at the San Diego Zoo and can still be viewed on the site. In May, the company launched a public beta and by November, a Nike ad featuring Ronaldinho became the first video to reach one million total views. The site launched officially on December 15, 2005, by which time the site was receiving 8 million views a day. Clips at the time were limited to 100 megabytes, as little as 30 seconds of footage.\n",
            "0.92: Karim said the inspiration for YouTube first came from the Super Bowl XXXVIII halftime show controversy when Janet Jackson's breast was briefly exposed by Justin Timberlake during the halftime show. Karim could not easily find video clips of the incident and the 2004 Indian Ocean Tsunami online, which led to the idea of a video-sharing site. Hurley and Chen said that the original idea for YouTube was a video version of an online dating service, and had been influenced by the website Hot or Not. They created posts on Craigslist asking attractive women to upload videos of themselves to YouTube in exchange for a $100 reward. Difficulty in finding enough dating videos led to a change of plans, with the site's founders deciding to accept uploads of any video.\n",
            "0.91: YouTube was not the first video-sharing site on the Internet; Vimeo was launched in November 2004, though that site remained a side project of its developers from CollegeHumor at the time and did not grow much, either. The week of YouTube's launch, NBC-Universal's \"Saturday Night Live\" ran a skit \"Lazy Sunday\" by The Lonely Island. Besides helping to bolster ratings and long-term viewership for \"Saturday Night Live\", \"Lazy Sunday\"'s status as an early viral video helped establish YouTube as an important website. Unofficial uploads of the skit to YouTube drew in more than five million collective views by February 2006 before they were removed when NBCUniversal requested it two months later based on copyright concerns. Despite eventually being taken down, these duplicate uploads of the skit helped popularize YouTube's reach and led to the upload of more third-party content. The site grew rapidly; in July 2006, the company announced that more than 65,000 new videos were being uploaded every day and that the site was receiving 100 million video views per day.\n"
          ]
        }
      ],
      "source": [
        "# Add Cohere Reranking Step\n",
        "# embeds=[np.float32(data_array[i]['embedding']) for i in range(len(data_array))]\n",
        "docs =[match['metadata']['text'] for match in res['matches']]\n",
        "\n",
        "rerank_response = co.rerank(\n",
        "  model = 'rerank-english-v2.0',\n",
        "  query = query,\n",
        "  documents = docs,\n",
        "  top_n = 3,\n",
        ")\n",
        "for response in rerank_response:\n",
        "  print(f\"{response.relevance_score:.2f}: {response.document['text']}\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
